//
// Project: Disagreement in science: Missing women

// start from an empty session: drop any data and stored objects left in memory
// NOTE(review): the ${data} global used below is not defined in the visible part of this file;
// presumably it is set by a calling do-file or profile -- confirm
clear all
// run the commands below under Stata 15.1 interpretation rules
version 15.1  



//
// AER
// Descriptive statistics for the AER sample. Scalars defined in this section:
//   bign1     author-article observations with non-missing female_genderize
//   n1        articles (one row per article_id)
//   m1        unique authors (one row per full_name)
//   au1_1-3   fraction of unique authors with gender assigned (1 = SSA, 2 = Genderize.io, 3 = manual)
//   f1_1-3    mean of the female indicator over author-article observations (same numbering)
//   c1        mean of comment over articles

// load the author-article data and apply the sample restrictions
use "${data}/output/aer_data_gender.dta", clear
drop if month=="May" & year!=2019  // exclude AEA papers and proceedings
drop if year==2020
keep if comment | research_article

// period of study
tabulate year

// author-article observations with known gender
summarize article_id if female_genderize!=.
scalar bign1 = r(N)

// publications (research articles and comments only): keep one row per article and count
preserve
bysort article_id: keep if _n==1
summarize article_id
scalar n1 = r(N)
restore

// unique authors: reduce to one row per full_name
preserve
collapse (firstnm) female_ssa female_genderize female_manual, by(full_name)
tabulate female_ssa, missing
scalar m1 = r(N)

// fraction of unique authors with gender assigned, by assignment method:
// SSA birth records (1), Genderize.io service (2), manual coding (3)
local j = 0
foreach method in ssa genderize manual {
    local ++j
    tabulate female_`method', missing  // all unique authors
    local n_authors = r(N)
    tabulate female_`method'           // authors with a non-missing gender
    local n_identified = r(N)
    scalar au1_`j' = `n_identified'/`n_authors'
}
restore

// fraction of author-publication observations with a female author, by assignment method
// (same numbering as above)
local j = 0
foreach method in ssa genderize manual {
    local ++j
    summarize female_`method'
    scalar f1_`j' = r(mean)
}

// distribution of research articles and comments: one row per article, then average comment
collapse (firstnm) type comment reply letter research_article, by(article_id)
summarize comment
scalar c1 = r(mean)



//
// ASR
// Descriptive statistics for the ASR sample. Scalars defined in this section:
//   bign2     author-article observations with non-missing female_genderize
//   n2        articles (one row per article_id)
//   m2        unique authors (one row per full_name)
//   au2_1-2   fraction of unique authors with gender assigned (1 = SSA, 2 = Genderize.io)
//   f2_1-2    mean of the female indicator over author-article observations (same numbering)
//   c2        mean of comment over articles

// load the author-article data and apply the sample restriction
use "${data}/output/asr_data_gender.dta", clear
keep if comment | research_article

// period of study
tabulate year

// author-article observations with known gender
summarize article_id if female_genderize!=.
scalar bign2 = r(N)

// publications (research articles and comments only): keep one row per article and count
preserve
bysort article_id: keep if _n==1
summarize article_id
scalar n2 = r(N)
restore

// unique authors: reduce to one row per full_name
preserve
collapse (firstnm) female_ssa female_genderize, by(full_name)
tabulate female_ssa, missing
scalar m2 = r(N)

// fraction of unique authors with gender assigned, by assignment method:
// SSA birth records (1), Genderize.io service (2)
local j = 0
foreach method in ssa genderize {
    local ++j
    tabulate female_`method', missing  // all unique authors
    local n_authors = r(N)
    tabulate female_`method'           // authors with a non-missing gender
    local n_identified = r(N)
    scalar au2_`j' = `n_identified'/`n_authors'
}
restore

// fraction of author-publication observations with a female author, by assignment method
// (same numbering as above)
local j = 0
foreach method in ssa genderize {
    local ++j
    summarize female_`method'
    scalar f2_`j' = r(mean)
}

// distribution of research articles and comments: one row per article, then average comment
collapse (firstnm) type comment reply letter research_article, by(article_id)
summarize comment
scalar c2 = r(mean)



//
// JAMA
// Descriptive statistics for the JAMA sample. Scalars defined in this section:
//   bign3     author-article observations with non-missing female_genderize
//   n3        articles (one row per article_id)
//   m3        unique authors (one row per full_name)
//   au3_1-2   fraction of unique authors with gender assigned (1 = SSA, 2 = Genderize.io)
//   f3_1-2    mean of the female indicator over author-article observations (same numbering)
//   c3        mean of comment over articles

// load the author-article data and apply the sample restrictions
use "${data}/output/jama_pubmed_data_gender.dta", clear
drop if year==2020
drop if year<2002  // full author names from PubMed not available
drop if year<2013  // Comment & Response section started in July 2013
// the first half of 2013 also predates the Comment & Response section
drop if year==2013 & inlist(month, "January", "February", "March", "April", "May", "June")
keep if comment | research_article
drop if article_with_etal
drop if strpos(full_name, "Fontanarosa")  // this JAMA editor appeared as first author of letters to the editor

// period of study
tabulate year

// author-article observations with known gender
summarize article_id if female_genderize!=.
scalar bign3 = r(N)

// publications (research articles and comments only): keep one row per article and count
preserve
bysort article_id: keep if _n==1
summarize article_id
scalar n3 = r(N)
restore

// unique authors: reduce to one row per full_name
preserve
collapse (firstnm) female_ssa female_genderize, by(full_name)
tabulate female_ssa, missing
scalar m3 = r(N)

// fraction of unique authors with gender assigned, by assignment method:
// SSA birth records (1), Genderize.io service (2)
local j = 0
foreach method in ssa genderize {
    local ++j
    tabulate female_`method', missing  // all unique authors
    local n_authors = r(N)
    tabulate female_`method'           // authors with a non-missing gender
    local n_identified = r(N)
    scalar au3_`j' = `n_identified'/`n_authors'
}
restore

// fraction of author-publication observations with a female author, by assignment method
// (same numbering as above)
local j = 0
foreach method in ssa genderize {
    local ++j
    summarize female_`method'
    scalar f3_`j' = r(mean)
}

// distribution of research articles and comments: one row per article, then average comment
collapse (firstnm) type comment reply letter research_article, by(article_id)
summarize comment
scalar c3 = r(mean)



//
// Nature
// Descriptive statistics for the Nature sample. Scalars defined in this section:
//   bign4     author-article observations with non-missing female_genderize
//   n4        articles (one row per article_id)
//   m4        unique authors (one row per full_name)
//   au4_1-2   fraction of unique authors with gender assigned (1 = SSA, 2 = Genderize.io)
//   f4_1-2    mean of the female indicator over author-article observations (same numbering)
//   c4        mean of comment over articles

// load the author-article data and apply the sample restrictions
use "${data}/output/nature_data_gender.dta", clear
drop if year==2020
keep if comment | research_article

// period of study
tabulate year

// author-article observations with known gender
summarize article_id if female_genderize!=.
scalar bign4 = r(N)

// publications (research articles and comments only): keep one row per article and count
preserve
bysort article_id: keep if _n==1
summarize article_id
scalar n4 = r(N)
restore

// unique authors: reduce to one row per full_name
preserve
collapse (firstnm) female_ssa female_genderize, by(full_name)
tabulate female_ssa, missing
scalar m4 = r(N)

// fraction of unique authors with gender assigned, by assignment method:
// SSA birth records (1), Genderize.io service (2)
local j = 0
foreach method in ssa genderize {
    local ++j
    tabulate female_`method', missing  // all unique authors
    local n_authors = r(N)
    tabulate female_`method'           // authors with a non-missing gender
    local n_identified = r(N)
    scalar au4_`j' = `n_identified'/`n_authors'
}
restore

// fraction of author-publication observations with a female author, by assignment method
// (same numbering as above)
local j = 0
foreach method in ssa genderize {
    local ++j
    summarize female_`method'
    scalar f4_`j' = r(mean)
}

// distribution of research articles and comments: one row per article, then average comment
collapse (firstnm) type comment reply letter research_article, by(article_id)
summarize comment
scalar c4 = r(mean)



//
// PNAS
// Descriptive statistics for the PNAS sample. Scalars defined in this section:
//   bign5     author-article observations with non-missing female_genderize
//   n5        articles (one row per article_id)
//   m5        unique authors (one row per full_name)
//   au5_1-2   fraction of unique authors with gender assigned (1 = SSA, 2 = Genderize.io)
//   f5_1-2    mean of the female indicator over author-article observations (same numbering)
//   c5        mean of comment over articles

// load the author-article data and apply the sample restrictions
use "${data}/output/pnas_data_gender.dta", clear
// name suffixes erroneously scraped as separate author-article observations
drop if inlist(full_name, "II", "III", "IV", "Jr", "Jr.")
drop if year<2008 | year==2020  // PNAS started comments in 2008
keep if comment | research_article

// period of study
tabulate year

// author-article observations with known gender
summarize article_id if female_genderize!=.
scalar bign5 = r(N)

// publications (research articles and comments only): keep one row per article and count
preserve
bysort article_id: keep if _n==1
summarize article_id
scalar n5 = r(N)
restore

// unique authors: reduce to one row per full_name
preserve
collapse (firstnm) female_ssa female_genderize, by(full_name)
tabulate female_ssa, missing
scalar m5 = r(N)

// fraction of unique authors with gender assigned, by assignment method:
// SSA birth records (1), Genderize.io service (2)
local j = 0
foreach method in ssa genderize {
    local ++j
    tabulate female_`method', missing  // all unique authors
    local n_authors = r(N)
    tabulate female_`method'           // authors with a non-missing gender
    local n_identified = r(N)
    scalar au5_`j' = `n_identified'/`n_authors'
}
restore

// fraction of author-publication observations with a female author, by assignment method
// (same numbering as above)
local j = 0
foreach method in ssa genderize {
    local ++j
    summarize female_`method'
    scalar f5_`j' = r(mean)
}

// distribution of research articles and comments: one row per article, then average comment
collapse (firstnm) type comment reply letter research_article, by(article_id)
summarize comment
scalar c5 = r(mean)



//
// Science
// Descriptive statistics for the Science sample. Scalars defined in this section:
//   bign6     author-article observations with non-missing female_genderize
//   n6        articles (one row per article_id)
//   m6        unique authors (one row per full_name)
//   au6_1-2   fraction of unique authors with gender assigned (1 = SSA, 2 = Genderize.io)
//   f6_1-2    mean of the female indicator over author-article observations (same numbering)
//   c6        mean of comment over articles

// load the author-article data and apply the sample restrictions
use "${data}/output/science_data_gender.dta", clear
drop if year==2020
keep if comment | research_article

// period of study
tabulate year

// author-article observations with known gender
summarize article_id if female_genderize!=.
scalar bign6 = r(N)

// publications (research articles and comments only): keep one row per article and count
preserve
bysort article_id: keep if _n==1
summarize article_id
scalar n6 = r(N)
restore

// unique authors: reduce to one row per full_name
preserve
collapse (firstnm) female_ssa female_genderize, by(full_name)
tabulate female_ssa, missing
scalar m6 = r(N)

// fraction of unique authors with gender assigned, by assignment method:
// SSA birth records (1), Genderize.io service (2)
local j = 0
foreach method in ssa genderize {
    local ++j
    tabulate female_`method', missing  // all unique authors
    local n_authors = r(N)
    tabulate female_`method'           // authors with a non-missing gender
    local n_identified = r(N)
    scalar au6_`j' = `n_identified'/`n_authors'
}
restore

// fraction of author-publication observations with a female author, by assignment method
// (same numbering as above)
local j = 0
foreach method in ssa genderize {
    local ++j
    summarize female_`method'
    scalar f6_`j' = r(mean)
}

// distribution of research articles and comments: one row per article, then average comment
collapse (firstnm) type comment reply letter research_article, by(article_id)
summarize comment
scalar c6 = r(mean)




